/**
 * Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

// These functions have been tested for all input values; any of them can be omitted to save space.

// The cycle counts below are averages over a uniform distribution of 32 bit input values.
// NAME       | TABLE | NON TABLE | BUILTIN | DESCRIPTION
// -----------|-------|-----------|---------|-------------
// CLZ32      |    13 |       9.6 |      20 | count of leading zeroes
// CTZ32      |    12 |        11 |      20 | count of trailing zeroes
// REVERSE32  |    21 |        22 |       - | bit reverse
// POPCOUNT32 |    18 |        20 |      22 | count of 1 bits
//

// Implementation selectors: a non-zero NEW_<fn> assembles the first ("#if") variant of
// the corresponding function below, together with the lookup table (if any) that this
// variant needs; zero selects the "#else" variant and its table instead.
// Each function is additionally wrapped in "#ifdef USE_<fn>"; no USE_<fn> macro is
// defined in the visible source, so presumably the build supplies them.
#define NEW_CLZ32 1
#define NEW_CTZ32 1
#define NEW_REVERSE32 1
#define NEW_POPCOUNT32 1

// Thumb code for Cortex-M0, placed in its own allocatable + executable ("ax") section.
.cpu cortex-m0
.thumb
.section .text.bit_functions, "ax"

#ifdef USE_POPCOUNT32
// popcount32: count the 1 bits of a 32 bit value
//   in:       r0 = value
//   out:      r0 = number of set bits (0..32)
//   clobbers: r1, r2, r3 and the condition flags (Thumb-1 ALU operations set them)
.align 2
.global popcount32
.type popcount32,%function
.thumb_func
popcount32:
#if NEW_POPCOUNT32
    // Table-free variant: build partial sums in parallel bit fields, then let a multiply
    // add the remaining fields together.
    // The literals are written "=value": the "=#value" spelling is not the documented
    // LDR pseudo-instruction syntax and may be rejected by assemblers other than GNU as.
    ldr r1,=0x49249249      @ every third bit set: bits 0,3,6,...,30
    lsr r2,r0,#1
    and r2,r1
    lsr r3,r0,#2
    and r3,r1
    and r0,r1
    add r0,r2
    add r0,r3      @ xx0xx0xx0xx0xx0xx0xx0xx0xx0xx0xx   each 3 bit group = count (0..3) of its input bits
    lsr r1,r0,#3
    add r0,r1      @ xx***xxx***xxx***xxx***xxx***xxx   add each group to its upper neighbour
    ldr r1,=0xC71C71C7
    and r0,r1      @ xx000xxx000xxx000xxx000xxx000xxx   keep every other group: counts of 6 input bits
    lsr r1,r0,#6
    add r0,r1      @ **00xxxx00****00xxxx00****00xxxx   fields at bits 0, 12 and 24 = counts of 12 input bits
    ldr r1,=0x04004004      @ bits 2, 14 and 26 set
    mul r0,r1               @ adds the fields at bits 0, 12 and 24 together into bits 26..31
    lsr r0,#26
    bx lr
#else
    // Table variant: add up the popcount8_table entries of the four bytes of the input.
    adr r1, popcount8_table
    uxtb r2, r0
    ldrb r3, [r1, r2]       @ r3 = running total, starts with byte 0
    lsr r0, r0, #8
    uxtb r2, r0
    ldrb r2, [r1, r2]
    add r3, r2              @ + byte 1
    lsr r0, r0, #8
    uxtb r2, r0
    ldrb r2, [r1, r2]
    add r3, r2              @ + byte 2
    lsr r0, r0, #8          @ only byte 3 is left, so no uxtb is needed
    ldrb r0, [r1, r0]
    add r0, r3              @ + byte 3
    bx lr
#endif
#endif

#ifdef USE_REVERSE32
// reverse32: reverse the order of the 32 bits of a value (bit 0 <-> bit 31, bit 1 <-> bit 30, ...)
//   in:       r0 = value
//   out:      r0 = bit-reversed value
//   clobbers: r1, r2, r3 and the condition flags (Thumb-1 ALU operations set them)
.align 2
.global reverse32
.type reverse32,%function
.thumb_func
reverse32:
#if NEW_REVERSE32
    // Table-free variant: swap adjacent bits, then bit pairs, then nibbles, then bytes.
    // The literals are written "=value": the "=#value" spelling is not the documented
    // LDR pseudo-instruction syntax and may be rejected by assemblers other than GNU as.
    ldr r2, =0xcccccccc
    lsr r1, r2, #1          @ 0x66666666
    eor r1, r2              @ r1 = 0xaaaaaaaa (computed instead of loading another literal)
    lsl r3, r0, #1
    and r3, r1              @ r3 = even bits moved up one place
    and r0, r1
    lsr r0, #1              @ r0 = odd bits moved down one place
    orr r0, r3              @ adjacent bits swapped
    lsl r3, r0, #2
    and r3, r2
    and r0, r2
    lsr r0, r0, #2
    orr r3, r0              @ adjacent bit pairs swapped
    ldr r2, =0xf0f0f0f0
    lsl r0, r3, #4
    and r0, r2
    and r3, r2
    lsr r3, r3, #4
    orr r0, r3              @ the two nibbles of every byte swapped
    rev r0, r0              @ finally reverse the byte order
    bx lr
#else
    // Table variant: reverse each byte through reverse8_table and reassemble the bytes in
    // the opposite order. Below, Rn denotes the reversed form of input byte n.
    adr r3, reverse8_table
    uxtb r1, r0             @ r1 = byte 0
    lsr r0, #8
    ldrb r2, [r3, r1]       @ r2 = 00 00 00 R0
    uxtb r1, r0             @ r1 = byte 1
    rev16 r2, r2            @ r2 = 00 00 R0 00
    ldrb r1, [r3, r1]
    orr r2, r1              @ r2 = 00 00 R0 R1
    lsr r0, #8
    uxtb r1, r0             @ r1 = byte 2
    ldrb r1, [r3, r1]
    rev r2, r2              @ r2 = R1 R0 00 00
    orr r2, r1              @ r2 = R1 R0 00 R2
    lsr r1, r0, #8          @ r1 = byte 3
    ldrb r0, [r3, r1]       @ r0 = 00 00 00 R3
    rev16 r2, r2            @ r2 = R0 R1 R2 00
    orr r0, r2              @ r0 = R0 R1 R2 R3
    bx lr
#endif
#endif

#ifdef USE_CLZ32
// clz32: count the leading zero bits of a 32 bit value
//   in:       r0 = value
//   out:      r0 = number of zero bits above the highest set bit; 32 when the value is 0
//   clobbers: r1, r3 and the condition flags (the #else variant also uses r2)
.global clz32
.type clz32,%function
.thumb_func
clz32:
#if NEW_CLZ32
    // Decision tree on the position of the highest set bit. Each shift also sets the
    // flags, so the following bne tests whether any bit remains above the shifted-out
    // part. Every leaf indexes clz6_table with a field of at most 6 bits and adds the
    // number of bits known to be zero above that field. The numbers in a label give the
    // range of results handled there.
    adr r3, clz6_table
    lsr r1, r0, #16
    // taken when the upper half word is non-zero (r1 = bits 16..31)
    bne clz32_0_15_n
//clz32_16_31_n:
    lsr r1, r0, #10
    // taken when one of bits 10..15 is set
    bne clz32_16_21_l
//clz32_22_31_n:
    lsr r1, r0, #4
    // taken when one of bits 4..9 is set
    bne clz32_22_27_l
//clz32_28_31_l:
    // r0 is at most 15 here; an input of 0 gives 6 + 26 = 32
    ldrb r0, [r3, r0]
    add r0, #28 - 2 // - 2 since we're using a 4 bit not 6 bit index
    bx lr
clz32_16_21_l:
    // r1 = bits 10..15
    ldrb r0, [r3, r1]
    add r0, #16
    bx lr
clz32_22_27_l:
    // r1 = bits 4..9
    ldrb r0, [r3, r1]
    add r0, #22
    bx lr
clz32_0_15_n:
    // r1 = upper half word (non-zero); r0 is free to be reused from here on
    lsr r0, r1, #10
    // taken when one of bits 26..31 is set
    bne clz32_0_5_l
//clz32_6_15_n:
    lsr r0, r1, #4
    // taken when one of bits 20..25 is set
    bne clz32_6_11_l
//clz32_12_15_l:
    // r1 = bits 16..19 (non-zero)
    ldrb r0, [r3, r1]
    add r0, #12 - 2 // - 2 since we're using a 4 bit not 6 bit index
    bx lr
clz32_0_5_l:
    // r0 = bits 26..31
    ldrb r0, [r3, r0]
    bx lr
clz32_6_11_l:
    // r0 = bits 20..25
    ldrb r0, [r3, r0]
    add r0, #6
    bx lr

#else

    // Variant using the 256 entry table: narrow the value down to its highest non-zero
    // byte, keeping in r2 the number of zero bits known to lie above that byte.
    adr r3, clz8_table
    mov r2, #24
    lsr r1, r0, #16
    // skip when the upper half word is zero
    beq 1f
    sub r2, #16
    mov r0, r1
1:
    lsr r1, r0, #8
    // skip when the upper byte of the remaining half word is zero
    beq 1f
    sub r2, #8
    mov r0, r1
1:
    // r0 is a single byte here; an input of 0 gives 8 + 24 = 32
    ldrb r0, [r3, r0]
    add r0, r2
    bx lr
#endif
#endif

#ifdef USE_CTZ32
// ctz32: count the trailing zero bits of a 32 bit value
//   in:       r0 = value
//   out:      r0 = number of zero bits below the lowest set bit; 32 when the value is 0
//   clobbers: r1, r3 and the condition flags (the #else variant also uses r2)
.global ctz32
.type ctz32,%function
.thumb_func
ctz32:
#if NEW_CTZ32
    // Decision tree on the position of the lowest set bit. Each left shift also sets the
    // flags, so the following beq tests whether all the bits kept by the shift are zero.
    // Every leaf indexes ctz6_table with a field of at most 6 bits and adds the bit
    // position at which that field starts. The numbers in a label give the range of
    // results handled there.
    adr r3, ctz6_table
    lsl r1, r0, #16
    beq ctz32_16_31_n       @ bits 0..15 are all zero
ctz32_0_15_n:
    lsl r0, r1, #6          @ r0 = value << 22
    beq ctz32_10_15_l       @ bits 0..9 are all zero
//ctz32_0_9_n:
    lsl r1, r0, #6          @ r1 = value << 28
    beq ctz32_4_9_l         @ bits 0..3 are all zero
//ctz32_0_3_l:
    // r1 becomes bits 0..3, which are known to be non-zero here, so the table entry
    // can be used directly without any index adjustment
    lsr r1, #28
    ldrb r0, [r3, r1]
    bx lr
ctz32_10_15_l:
    lsr r1, #26             @ r1 = bits 10..15
    ldrb r0, [r3, r1]
    add r0, #10
    bx lr
ctz32_4_9_l:
    lsr r0, #26             @ r0 = bits 4..9
    ldrb r0, [r3, r0]
    add r0, #4
    bx lr
ctz32_16_31_n:
    lsl r1, r0, #6          @ r1 = value << 6
    beq ctz32_26_31_l       @ bits 0..25 are all zero
//ctz32_16_25_n:
    lsl r0, r1, #6          @ r0 = value << 12
    beq ctz32_20_25_l       @ bits 0..19 are all zero
//ctz32_16_19_l:
    // r0 becomes bits 16..19, which are known to be non-zero here, so the table entry
    // can be used directly without any index adjustment
    lsr r0, #28
    ldrb r0, [r3, r0]
    add r0, #16
    bx lr
ctz32_26_31_l:
    // r0 becomes bits 26..31; an input of 0 indexes entry 0 (value 6), giving 32
    lsr r0, #26
    ldrb r0, [r3, r0]
    add r0, #26
    bx lr
ctz32_20_25_l:
    lsr r1, #26             @ r1 = bits 20..25
    ldrb r0, [r3, r1]
    add r0, #20
    bx lr
#else
    // Variant using the 256 entry table: narrow the value down to its lowest non-zero
    // byte, keeping in r2 the bit position at which that byte starts.
    adr r3, ctz8_table
    mov r2, #0
    lsl r1, r0, #16
    bne 1f                  @ the lower half word is non-zero
    add r2, #16
    lsr r0, r0, #16
1:
    lsl r1, r0, #24
    bne 1f                  @ the low byte is non-zero
    add r2, #8
    rev16 r0, r0            @ bring byte 1 down into byte 0
1:
    uxtb r0, r0
    ldrb r0, [r3, r0]       @ an input of 0 indexes entry 0 (value 8), giving 8 + 24 = 32
    add r0, r2
    bx lr
#endif
#endif

// Literal pool: the constants loaded with "ldr rN, =value" in the functions above are
// emitted here by the assembler.
rt0_literals:
.ltorg

// Word-align the lookup tables that follow; the functions above locate them with adr.
.align 2
#ifdef USE_POPCOUNT32
#if !NEW_POPCOUNT32
// popcount8_table[b] = number of 1 bits in the byte value b.
// 256 entries, 32 per row; only needed by the table variant of popcount32.
.global popcount8_table
popcount8_table:
  .byte 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05
  .byte 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06
  .byte 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06
  .byte 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07
  .byte 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06
  .byte 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07
  .byte 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07
  .byte 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08
#endif
#endif

#ifdef USE_REVERSE32
#if !NEW_REVERSE32
// reverse8_table[b] = the byte value b with the order of its 8 bits reversed.
// 256 entries, 32 per row; only needed by the table variant of reverse32.
.global reverse8_table
reverse8_table:
  .byte 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0, 0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8, 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8
  .byte 0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4, 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4, 0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec, 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc
  .byte 0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2, 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2, 0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea, 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa
  .byte 0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6, 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6, 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee, 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe
  .byte 0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1, 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1, 0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9, 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9
  .byte 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5, 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5, 0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed, 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd
  .byte 0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3, 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3, 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb, 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb
  .byte 0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7, 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7, 0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef, 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff
#endif
#endif

#ifdef USE_CLZ32
#if NEW_CLZ32
// clz6_table[i] = number of leading zero bits of i taken as a 6 bit value (6 for i == 0).
// 64 entries, 32 per row; each entry is written as the 8 bit leading zero count minus 2.
.global clz6_table
clz6_table:
  .byte 0x08-2, 0x07-2, 0x06-2, 0x06-2, 0x05-2, 0x05-2, 0x05-2, 0x05-2, 0x04-2, 0x04-2, 0x04-2, 0x04-2, 0x04-2, 0x04-2, 0x04-2, 0x04-2, 0x03-2, 0x03-2, 0x03-2, 0x03-2, 0x03-2, 0x03-2, 0x03-2, 0x03-2, 0x03-2, 0x03-2, 0x03-2, 0x03-2, 0x03-2, 0x03-2, 0x03-2, 0x03-2
  .byte 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2
#else
// clz8_table[b] = number of leading zero bits of the byte value b (8 for b == 0).
// 256 entries, 32 per row.
.global clz8_table
clz8_table:
  .byte 0x08, 0x07, 0x06, 0x06, 0x05, 0x05, 0x05, 0x05, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
  .byte 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02
  .byte 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01
  .byte 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01
  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
#endif
#endif

#ifdef USE_CTZ32
#if NEW_CTZ32
// ctz6_table[i] = number of trailing zero bits of the 6 bit value i (6 for i == 0).
// 64 entries, 32 per row.
// The .global directive lives inside this branch so that the symbol is only declared by
// the build variant that also defines it, mirroring the clz tables.
.global ctz6_table
ctz6_table:
  .byte 0x06, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00
  .byte 0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00
#else
// ctz8_table[b] = number of trailing zero bits of the byte value b (8 for b == 0).
// 256 entries, 32 per row.
.global ctz8_table
ctz8_table:
  .byte 0x08, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00
  .byte 0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00
  .byte 0x06, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00
  .byte 0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00
  .byte 0x07, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00
  .byte 0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00
  .byte 0x06, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00
  .byte 0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00
#endif
#endif
